library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
# Working with Spatial Data
library(sf)
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(rmapshaper)
## Registered S3 method overwritten by 'geojsonlint':
## method from
## print.location dplyr
# Embedding HTML Widgets
library(htmltools)
library(leaflet)
Import daily covid cases and deaths for the UK from the Public Health England coronavirus data api.
uk_daily_covid19_cases <- read_csv("https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=newCasesByPublishDate&format=csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## areaCode = col_character(),
## areaName = col_character(),
## areaType = col_character(),
## date = col_date(format = ""),
## newCasesByPublishDate = col_double()
## )
uk_daily_covid19_deaths <- read_csv("https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=newOnsDeathsByRegistrationDate&format=csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## areaCode = col_character(),
## areaName = col_character(),
## areaType = col_character(),
## date = col_date(format = ""),
## newOnsDeathsByRegistrationDate = col_double()
## )
glimpse(uk_daily_covid19_cases)
## Rows: 92,171
## Columns: 5
## $ areaCode <chr> "E06000003", "E06000014", "E06000050", "E0800000~
## $ areaName <chr> "Redcar and Cleveland", "York", "Cheshire West a~
## $ areaType <chr> "utla", "utla", "utla", "utla", "utla", "utla", ~
## $ date <date> 2021-07-18, 2021-07-18, 2021-07-18, 2021-07-18,~
## $ newCasesByPublishDate <dbl> 323, 136, 293, 253, 266, 180, 519, 131, 289, 133~
glimpse(uk_daily_covid19_deaths)
## Rows: 11,484
## Columns: 5
## $ areaCode <chr> "E06000003", "E06000014", "E06000050", ~
## $ areaName <chr> "Redcar and Cleveland", "York", "Cheshi~
## $ areaType <chr> "utla", "utla", "utla", "utla", "utla",~
## $ date <date> 2021-07-02, 2021-07-02, 2021-07-02, 20~
## $ newOnsDeathsByRegistrationDate <dbl> 1, 0, 1, 4, 0, 1, 5, 0, 0, 0, 0, 0, 0, ~
uk_daily_covid19_deaths %>%
select(areaCode) %>%
unique() %>%
filter(str_detect(areaCode, "^E")) %>%
nrow()
## [1] 149
Eng_daily_covid19_indicators <- uk_daily_covid19_cases %>%
full_join(uk_daily_covid19_deaths) %>%
filter(str_detect(areaCode, "^E")) %>%
replace_na(list(newOnsDeathsByRegistrationDate = 0,
newCasesByPublishDate = 0))
## Joining, by = c("areaCode", "areaName", "areaType", "date")
Data has 92171 rows and 5 columns.
Dates: 2020-04-17 to 2021-07-18.
It assigns covid cases by the date the data was first included in published totals, not when the initial test was taken.
The data is at Upper Tier Local Authority level, of which 149 are in England.
Data has 11484 rows and 5 columns.
Dates: 2020-03-13 to 2021-07-02.
It assigns deaths to covid if COVID-19 is mentioned as a cause on the death certificate, while the date refers to the date the death was registered.
The data is at Upper Tier Local Authority level, of which 149 are in England.
mid_year_population_2020 <- read_excel("data/ukpopestimatesmid2020on2020geography.xlsx",
sheet = "MYE4",
skip = 7) %>%
select("areaCode" = Code, "areaName" = Name, Geography, "populationMid2020" = `Mid-2020`) %>%
filter(str_detect(areaCode, "^E")) %>%
filter(Geography %in% c("Unitary Authority", "Metropolitan District", "County", "London Borough"))
utla_rural_classification <- read_excel("data/utla_ruc.xlsx",
skip = 2) %>%
rename("areaCode" = `UTLA19 CD`,
"areaName" = `UTLA19 NM`,
"detailedRuralClassification" = RUC11,
"broadRuralClassification" = `Broad RUC11`)
utla_data <- mid_year_population_2020 %>%
left_join(utla_rural_classification) %>%
replace_na(list(detailedRuralClassification = "Urban with Significant Rural",
broadRuralClassification = "Urban with Significant Rural")) %>%
mutate(areaName = str_replace(areaName, "City of London", "Hackney and City of London") %>%
str_replace(., "Hackney$", "Hackney and City of London") %>%
str_replace(., "Isles of Scilly", "Cornwall and Isles of Scilly") %>%
str_replace(., "Cornwall$", "Cornwall and Isles of Scilly"),
areaCode = str_replace(areaCode, "E09000001", "E09000012"),
areaCode = str_replace(areaCode, "E06000053", "E06000052"),
areaCode = str_replace(areaCode, "E06000060", "E10000002")) %>%
group_by(areaCode, areaName, Geography, detailedRuralClassification, broadRuralClassification) %>%
summarise(populationMid2020 = sum(populationMid2020)) %>%
ungroup()
## Joining, by = c("areaCode", "areaName")
## `summarise()` has grouped output by 'areaCode', 'areaName', 'Geography', 'detailedRuralClassification'. You can override using the `.groups` argument.
glimpse(utla_data)
## Rows: 149
## Columns: 6
## $ areaCode <chr> "E06000001", "E06000002", "E06000003", "E0~
## $ areaName <chr> "Hartlepool", "Middlesbrough", "Redcar and~
## $ Geography <chr> "Unitary Authority", "Unitary Authority", ~
## $ detailedRuralClassification <chr> "Urban with City and Town", "Urban with Ci~
## $ broadRuralClassification <chr> "Predominantly Urban", "Predominantly Urba~
## $ populationMid2020 <dbl> 93836, 141285, 137228, 197419, 107402, 129~
In order to compare the data between local authorities, we must normalise the data for population level, as a local authority with a higher population would naturally accumulate more covid cases and deaths, ceterus-paribus.
I have imported the rural urban classification for upper tier local authorities, which is constructed using the same method as the local authority district rural urban classification. Areas with a rural population over 50% are classified as predominantly rural, and areas with a rural population below 25% are classified as predominantly urban.
detailed_ruc_order <- c("Mainly Rural", "Largely Rural", "Urban with Significant Rural", "Urban with City and Town", "Urban with Minor Conurbation", "Urban with Major Conurbation")
overall_data <- Eng_daily_covid19_indicators %>%
left_join(utla_data) %>%
group_by(detailedRuralClassification) %>%
summarise(TotalCases = sum(newCasesByPublishDate),
TotalDeaths = sum(newOnsDeathsByRegistrationDate),
nLAD = n_distinct(areaCode)) %>%
mutate(detailedRuralClassification = factor(detailedRuralClassification, detailed_ruc_order)) %>%
arrange(detailedRuralClassification) %>%
left_join(utla_data %>%
group_by(detailedRuralClassification) %>%
summarise(pop2020 = sum(populationMid2020),
nLADcheck = n_distinct(areaCode))) %>%
ungroup() %>%
mutate(OverallCaseRate = TotalCases/pop2020*100000,
OverallDeathRate = TotalDeaths/pop2020*100000)
## Joining, by = c("areaCode", "areaName")
## Joining, by = "detailedRuralClassification"
# overall_data %>%
# ggplot(aes(OverallCaseRate, detailedRuralClassification)) +
# geom_col()
A clear pattern in the overall rates of covid cases when each local authority is aggregated by rural classification.
The overall case rate for the most rural areas is 3945 cases per 100,000 population, while the overall case rate for the most urban areas is 9816 cases per 100,000 population.
The covid rate of the most urban areas is 2.5 times the covid rate in the most rural areas.
However, the overall death rate had less clear pattern. The most rural areas had the a rate of 124 deaths registered per 100,000 population, while the overall death rate for the most urban areas had the 3rd highest death rate of 241 deaths registered per 100,000 population. The highest death rate came from Urban areas with Minor Conurbations with a death rate of 264 deaths registered per 100,000 population.
The covid death rate of the most urban areas is 1.9 times the covid death rate in the most rural areas.
UTLA_shp <- st_read("spatial data/COunties_and_Unitary_Authorities_(December_2019)_Boundaries_UK_BFC.shp") %>%
rmapshaper::ms_simplify()
## Reading layer `Counties_and_Unitary_Authorities_(December_2019)_Boundaries_UK_BFC' from data source `C:\Users\spoic\OneDrive\Documents\R\Projects\Data-Science-Graduate-Scheme\spatial data\Counties_and_Unitary_Authorities_(December_2019)_Boundaries_UK_BFC.shp' using driver `ESRI Shapefile'
## Simple feature collection with 216 features and 10 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -116.1928 ymin: 5337.901 xmax: 655653.8 ymax: 1220302
## projected CRS: OSGB 1936 / British National Grid
UTLA_cases_deaths <- Eng_daily_covid19_indicators %>%
group_by(areaCode, areaName) %>%
summarise(TotalCases = sum(newCasesByPublishDate),
TotalDeaths = sum(newOnsDeathsByRegistrationDate),
nLAD = n_distinct(areaCode)) %>%
left_join(utla_data) %>%
ungroup() %>%
mutate(OverallCaseRate = TotalCases/populationMid2020*100000,
OverallDeathRate = TotalDeaths/populationMid2020*100000)
## `summarise()` has grouped output by 'areaCode'. You can override using the `.groups` argument.
## Joining, by = c("areaCode", "areaName")
UTLA_ruc_shp <- left_join(UTLA_shp, UTLA_cases_deaths, by = c("ctyua19cd" = "areaCode")) %>%
st_transform(4326) %>% # Convert from a UK Projection (epsg = 27700) to a Global Projection (epsg = 4326)
mutate(OverallCaseRate_comma = scales::comma(OverallCaseRate, accuracy = 1),
OverallDeathRate_comma = scales::comma(OverallDeathRate, accuracy = 1))
# select(lad11cd, lad11nm, ruc11, broad_ruc11, st_areasha)
bins <- c(0, 1000, 2000, 4000, 6000, 8000, 10000, Inf)
pal <- colorBin("YlOrRd", domain = UTLA_ruc_shp$OverallCaseRate, bins = bins)
labels <- sprintf(
"<strong>%s</strong><br/>%s<strong><br/>%s",
UTLA_ruc_shp$areaName, UTLA_ruc_shp$OverallCaseRate_comma, UTLA_ruc_shp$detailedRuralClassification
) %>% lapply(htmltools::HTML)
UTLA_ruc_shp %>%
leaflet() %>%
addTiles() %>%
addPolygons(
stroke = T,
smoothFactor = 1,
fillOpacity = 0.7,
fillColor = ~pal(OverallCaseRate),
color = "white",
weight = 2,
dashArray = "1",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"))
bins <- c(0, 50, 100, 150, 200, 250, 300, Inf)
pal <- colorBin("YlOrRd", domain = UTLA_ruc_shp$OverallDeathRate, bins = bins)
labels <- sprintf(
"<strong>%s</strong><br/>%s<strong><br/>%s",
UTLA_ruc_shp$areaName, UTLA_ruc_shp$OverallDeathRate_comma, UTLA_ruc_shp$detailedRuralClassification
) %>% lapply(htmltools::HTML)
UTLA_ruc_shp %>%
leaflet() %>%
addTiles() %>%
addPolygons(
stroke = T,
smoothFactor = 1,
fillOpacity = 0.7,
fillColor = ~pal(OverallDeathRate),
color = "white",
weight = 2,
dashArray = "1",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"))